################################################################################
# Code for the analysis of the Replication
# Estimating Onsets of Binary Events in Panel Data
# Liam F. McGrath
#
# Purpose: Replicating the extreme bounds analysis of Hegre and Sambanis 06
# henceforth referred to as HS. Due to time and computational constraints I use
# a random sample of variables from the available control sets.
# All models estimated for a given variable of interested use the same
# control set to ensure to ensure comparability.
#
# This script requires the data files hs_data.dta and vars.csv located in the
# data subfolder.
################################################################################


rm(list = ls())

library(foreign)
library(ggthemes)
library(ggplot2)
library(parallel)
library(pscl)

# IMPORTANT:
# Define the path which all the code, data and results are located within
# From this changes in working directory are formed by pasting this
# string with a string for the relevant subfolder.
# If using this code be sure to change to this to your own directory
dir <- "/Users/liammcgrath/Documents/Academic/Research/transition/replication_materials/"

setwd(paste0(dir, "data"))
# Define country clustered standard errors function
# Code taken from Daina Chiba

logit.cluster <- function(fit){         # fitted object from glm
  
  F <- function(x){1/(1+exp(-x))}       # logit CDF
  f <- function(x){exp(x)/(1+exp(x))^2} # logit PDF
  fml <- fit$formula
  data <- fit$data
  beta <- fit$coef
  vcov <- vcov(fit)
  k <- length(beta)
  y <- fit$y
  m <- dim(table(data$cowcode))    # N of clusters
  xvars <- names(beta)            # Name of covariates ordered accordingly
  xvars <- xvars[2:length(xvars)] # Delete (Intercept)
  xmat <- as.matrix(data[,xvars]) # Design matrix
  xmat <- cbind(1, xmat)          # Add intercept
  xb <- xmat %*% beta             # linear predictor (xb)
  ## Now, obtain clustered s.e.
  u <- ((y==1) * f(xb)/F(xb) + (y==0) * -f(xb)/(1-F(xb)))[,1] * xmat
  u.clust <- matrix(NA, nrow=m, ncol=k)
  fc <- factor(data$cowcode)
  for (i in 1:k){ ## loop over covariates
    u.clust[,i] <- tapply(u[,i], fc, sum) ## sum over unit
  }
  cl.vcov <- vcov %*% ((m/(m-1)) * t(u.clust)%*%(u.clust)) %*% vcov ## sandwich
  cl.se <- sqrt(diag(cl.vcov)) ## clustered s.e.
  return(cl.se)
}



data <- read.dta("data_hs.dta") # Load HS replication data

data <- data[data$year >= 1960, ] # restrict sample to that used by HS


depvar <- c("warstns", "warstnsb", "atwarns") # na ongoing 1st, zero ongoing 2nd
fixed_rhs <- c("ln_popns", "t", "t2", "t3", "ln_gdpen")
fixed_rhs_eq <- "ln_popns + ln_gdpen + t + t2 + t3" # equation for fixed RHS
fixed_rhs_eq_int <- paste("ln_popns + ln_popns_lagatwarns +",
                          "ln_gdpen + ln_gdpen_lagatwarns +",
                          "t + t2 + t3") # equation for fixed RHS interacted
                                         # with the lag of war


# Vector that defines the substantive grouping of variables defined by HS
# See table 4 in appendix of Estimating Onsets of Binary Events in Panel Data
varconc <- c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,6,6,6,6,4,3,3,3,3,3,3,3,3,4,
             3,3,3,3,3,3,5,5,5,5,5,5,5,5,15,15,15,18,9,10,9,10,8,8,9,8,9,8,9,9,
             14,14,14,14,14,17,17,7,7,7 ,7,12,12,16,16,11,13,13,13,13,13,
             4,3,5,14,17,17,17,4,4)

# Variables that are of interest for their effects, defined by HS
vars <- read.csv("vars.csv")[,2]
vars <- as.character(vars)

# Data frame that contains these variable names, their category, and a number id
vary_rhs <- data.frame(varname = vars, categ = varconc, varnum = 1:88)
vary_rhs$varname <- as.character(vary_rhs$varname)

# Subset the data to include only variables needed for the estimations
subdata <- data[,c(vary_rhs$varname, depvar, fixed_rhs, "lag_atwarns", "cowcode")]

# Standardise RHS rotating vars, as done by HS
for (i in vary_rhs$varname) {
  
  subdata[, i] <- (subdata[,i] - mean(subdata[,i], na.rm = TRUE)) / sd(subdata[,i], na.rm = TRUE) 
  
}

# Now create variables interacted by lag incidence for the Markov model
for (i in names(subdata)) {
  
  todo <- paste0("subdata$",i,"_lagatwarns <- subdata[,'lag_atwarns'] * ",
                 "subdata[,'",i,"']")
  eval(parse(text = todo))
  
}

# Set up
z_var <- c("ln_popns", "ln_gdpen", "t", "t2", "t3") # fixed variables
x = vary_rhs[,1] # names of variables we are gauging sensitivity of
variables = vary_rhs
y = depvar
z = fixed_rhs_eq
s = 5000 # number of random samples from the control set
z_inter = fixed_rhs_eq_int
z_var = z_var


# So for each var, have to run the models, then store relevant info

setwd(paste0(dir,"output/"))

set.seed(12345) # seed for replicability purposes


for (j in x) {
  
  # All combinations of control variables allowed given the current rotating var
  categ <- variables[variables$varname == j, "categ"]
  controls <- expand.grid(v1 = variables$varname[variables$categ != categ],
                          v2 = variables$varname[variables$categ != categ],
                          v3 = variables$varname[variables$categ != categ])
  controls <- controls[controls$v1 != controls$v2,]
  controls <- controls[controls$v1 != controls$v3,]
  controls <- controls[controls$v2 != controls$v3,]
  
  # Sample s times without replacement from these controls
  samples <- sample(1:nrow(controls), s, replace=FALSE)
  controls <- controls[samples,]
  
  # Create a results matrix
  # cols = varname, beta, standard error, p-value, measure of fit
  results <- matrix(NA, nrow = nrow(controls), ncol = 19)
  results <- as.data.frame(results)
  names(results) <- c("varname", "beta_na", "se_na", "pvalue_na", "fit_na", "n1",
                      "beta_zero", "se_zero", "pvalue_zero", "fit_zero", "n2",
                      "beta_mark", "se_mark", "pvalue_mark",  
                      "beta_d_mark", "se_d_mark", "pvalue_d_mark", "fit_mark", "n3")
  results$varname <- j
  
  for (i in 1:nrow(controls)) {
    
    cur_data1 <- subdata[,c(y[1], j, z_var, as.character(controls[i,1]),
                            as.character(controls[i,2]),
                            as.character(controls[i,3]), "cowcode")]
    cur_data1 <- na.omit(cur_data1)
    # Ongoing years set to missing
    model_code <- paste0("model <- glm(",y[1], " ~ ", j, " + ", 
                         z, " + ", controls[i,1], " + ", 
                         controls[i,2], " + ", controls[i,3],
                         ", family = binomial(link = 'logit'), data = cur_data1)")
    eval(parse(text = model_code))
    
    if (length(coef(model)) != length(na.omit(coef(model)))) {
      
      rhs <- paste(names(na.omit(coef(model)))[-1], collapse = " + ")
      model_code <- paste0("model <- glm(",y[1], " ~ ", rhs,
                           ", family = binomial(link = 'logit'), data = cur_data1)")
      eval(parse(text = model_code))
    }
    
    # If x !%in% names have to do something
    if (j %in% names(coef(model))) {
      results[i, 2] <- summary(model)$coefficients[2,1] # beta estimate
      results[i, 3] <- logit.cluster(model)[2] # cluster se
      results[i, 4] <- 2*(1-pnorm(abs(results[i, 2]/results[i, 3]))) # p value
      
      results[i, 5] <- pR2(model)[4] # McFadden's R2 for model
      results[i,6] <- nobs(model) # number of observations
    }
    
    # Ongoing years set to zero
    cur_data2 <- subdata[,c(y[2], j, z_var, as.character(controls[i,1]),
                            as.character(controls[i,2]),
                            as.character(controls[i,3]), "cowcode")]
    cur_data2 <- na.omit(cur_data2)
    
    model_code <- paste0("model2 <- glm(",y[2], " ~ ", j, " + ", 
                         z, " + ", controls[i,1], " + ", 
                         controls[i,2], " + ", controls[i,3],
                         ", family = binomial(link = 'logit'), data = cur_data2)")
    eval(parse(text = model_code))
    
    if (length(coef(model2)) != length(na.omit(coef(model2)))) {
      
      rhs <- paste(names(na.omit(coef(model2)))[-1], collapse = " + ")
      model_code <- paste0("model2 <- glm(",y[2], " ~ ", rhs,
                           ", family = binomial(link = 'logit'), data = cur_data2)")
      eval(parse(text = model_code))
    }
    
    
    if (j %in% names(coef(model2))) {
      results[i, 7] <- summary(model2)$coefficients[2,1]
      results[i, 8] <- logit.cluster(model2)[2]
      results[i, 9] <- 2*(1-pnorm(abs(results[i, 7]/results[i, 8])))
      results[i, 10] <- pR2(model2)[4]
      results[i,11] <- nobs(model2)
    }
    
    # Markov model
    cur_data3 <- subdata[,c(y[3], j, 
                            paste0(j,"_lagatwarns"),
                            z_var,
                            paste0(z_var[1],"_lagatwarns"),
                            paste0(z_var[2],"_lagatwarns"),
                            as.character(controls[i,1]),
                            paste0(as.character(controls[i,1]),"_lagatwarns"),
                            as.character(controls[i,2]),
                            paste0(as.character(controls[i,2]),"_lagatwarns"),
                            as.character(controls[i,3]), 
                            paste0(as.character(controls[i,3]),"_lagatwarns"),
                            "lag_atwarns", "cowcode")]
    cur_data3 <- na.omit(cur_data3)
    
    model_code <- paste0("model3 <- glm(",y[3], " ~ ", j, 
                         " + ",j, "_lagatwarns + ",
                         z_inter, " + lag_atwarns + ", 
                         controls[i,1], " + ",controls[i,1], "_lagatwarns + ",
                         controls[i,2], " + ",controls[i,2], "_lagatwarns + ",
                         controls[i,3], " + ",controls[i,3], "_lagatwarns",
                         ", family = binomial(link = 'logit'), data = cur_data3)")
    eval(parse(text = model_code))
    
    if (length(coef(model3)) != length(na.omit(coef(model3)))) {
      
      rhs <- paste(names(na.omit(coef(model3)))[-1], collapse = " + ")
      model_code <- paste0("model3 <- glm(",y[3], " ~ ", rhs,
                           ", family = binomial(link = 'logit'), data = cur_data3)")
      eval(parse(text = model_code))
    }
    
    if (j %in% names(coef(model3))) {
      results[i, 12] <- summary(model3)$coefficients[2,1] # effect on onsets
      results[i, 13] <- logit.cluster(model3)[2] # se for effect on onsets
      results[i, 14] <- 2*(1-pnorm(abs(results[i, 12]/results[i, 13])))
      results[i, 15] <- summary(model3)$coefficients[3,1] # change in effect
                                                          # when y_t-1 = 1
      results[i, 16] <- logit.cluster(model3)[3]         # se for this
      results[i, 17] <- 2*(1-pnorm(abs(results[i, 15]/results[i, 16])))
      results[i, 18] <- pR2(model3)[4]
      results[i, 19] <- nobs(model3)
    }
    
    cat(".")    
    
  }
  
  # Write a csv of the results for each variable
  write.csv(results, file = paste0(j, ".csv"), row.names = FALSE)
  cat("Variable: ", j, " Complete \n")
  
  
}

######################
# Now load in all these results and save in a manageable way
#######################

setwd(paste0(dir, "data"))

# Variable categories (see table XX in the supplementary materials)
varconc <- c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,6,6,6,6,4,3,3,3,3,3,3,3,3,4,
             3,3,3,3,3,3,5,5,5,5,5,5,5,5,15,15,15,18,9,10,9,10,8,8,9,8,9,8,9,9,
             14,14,14,14,14,17,17,7,7,7 ,7,12,12,16,16,11,13,13,13,13,13,
             4,3,5,14,17,17,17,4,4)

# Variables that are of interest for their effects
vars <- read.csv("vars.csv")[,2]
vars <- as.character(vars)



# Data frame that contains variable names, their category, and a number id
vary_rhs <- data.frame(varname = vars, categ = varconc, varnum = 1:88)
vary_rhs <- data.frame(varname = vars, varnum = 1:8)
vary_rhs$varname <- as.character(vary_rhs$varname)

# Change directory to where the results of the HS are
setwd(paste0(dir, "output"))

# Load in all the results

for (i in vary_rhs[,1]) {
  
  todo <- paste0(i, " <- read.csv('", i, ".csv')")
  eval(parse(text = todo))
  
}

# Summarise info for each variable:
# - mean coeff and pvalue
# - unweighted and weighted by pseudoR2 versions

summaries <- data.frame(varname = vary_rhs[,1], beta_na = NA, beta_zero = NA,
                        pvalue_na = NA, pvalue_zero = NA,
                        beta_w_na = NA, beta_w_zero = NA,
                        pvalue_w_na = NA, pvalue_w_zero = NA,
                        beta_mark = NA, pvalue_mark = NA,
                        beta_w_mark = NA, pvalue_w_mark = NA, 
                        beta_d_mark = NA, pvalue_d_mark = NA, 
                        beta_d_w_mark = NA, pvalue_d_w_mark = NA,
                        se_w_na = NA, se_w_zero = NA, se_w_mark = NA
)
summaries$varname <- as.character(summaries$varname)

for (i in summaries[,1]) {
  
  ###### Unweighted means ######
  
  # Set to zero
  todo <- paste0("summaries$beta_zero[summaries$varname == '", i, "'] <- ",
                 "mean(", i , "$beta_zero, na.rm = TRUE)")
  eval(parse(text = todo))
  
  todo <- paste0("summaries$pvalue_zero[summaries$varname == '", i, "'] <- ",
                 "mean(", i , "$pvalue_zero, na.rm = TRUE)")
  eval(parse(text = todo))
  
  # Set to na
  todo <- paste0("summaries$beta_na[summaries$varname == '", i, "'] <- ",
                 "mean(", i , "$beta_na, na.rm = TRUE)")
  eval(parse(text = todo))
  
  todo <- paste0("summaries$pvalue_na[summaries$varname == '", i, "'] <- ",
                 "mean(", i , "$pvalue_na, na.rm = TRUE)")
  eval(parse(text = todo))
  
  # First order markov w/ incidence as LHS
  todo <- paste0("summaries$beta_mark[summaries$varname == '", i, "'] <- ",
                 "mean(", i , "$beta_mark, na.rm = TRUE)")
  eval(parse(text = todo))
  
  todo <- paste0("summaries$pvalue_mark[summaries$varname == '", i, "'] <- ",
                 "mean(", i , "$pvalue_mark, na.rm = TRUE)")
  eval(parse(text = todo))
  
  todo <- paste0("summaries$beta_d_mark[summaries$varname == '", i, "'] <- ",
                 "mean(", i , "$beta_d_mark, na.rm = TRUE)")
  eval(parse(text = todo))
  
  todo <- paste0("summaries$pvalue_d_mark[summaries$varname == '", i, "'] <- ",
                 "mean(", i , "$pvalue_d_mark, na.rm = TRUE)")
  eval(parse(text = todo))
  
  ####### Weighted means (by McFadden's pseudo-R2) #######
  
  # Set to zero
  todo <- paste0("summaries$beta_w_zero[summaries$varname == '", i, "'] <- ",
                 "weighted.mean(", i , "$beta_zero, ", i , "$fit_zero, na.rm = TRUE)")
  eval(parse(text = todo))
  
  todo <- paste0("summaries$pvalue_w_zero[summaries$varname == '", i, "'] <- ",
                 "weighted.mean(", i , "$pvalue_zero, ", i , "$fit_zero, na.rm = TRUE)")
  eval(parse(text = todo))
  
  todo <- paste0("summaries$se_w_zero[summaries$varname == '", i, "'] <- ",
                 "weighted.mean(", i , "$se_zero, ", i , "$fit_zero, na.rm = TRUE)")
  eval(parse(text = todo))
  
  
  # Set to NA
  todo <- paste0("summaries$beta_w_na[summaries$varname == '", i, "'] <- ",
                 "weighted.mean(", i , "$beta_na, ", i , "$fit_na, na.rm = TRUE)")
  eval(parse(text = todo))
  
  todo <- paste0("summaries$pvalue_w_na[summaries$varname == '", i, "'] <- ",
                 "weighted.mean(", i , "$pvalue_na, ", i , "$fit_na, na.rm = TRUE)")
  eval(parse(text = todo))
  
  todo <- paste0("summaries$se_w_na[summaries$varname == '", i, "'] <- ",
                 "weighted.mean(", i , "$se_na, ", i , "$fit_na, na.rm = TRUE)")
  eval(parse(text = todo))
  
  # First order markov with incidence
  todo <- paste0("summaries$beta_w_mark[summaries$varname == '", i, "'] <- ",
                 "weighted.mean(", i , "$beta_mark, ", i , "$fit_mark, na.rm = TRUE)")
  eval(parse(text = todo))
  
  todo <- paste0("summaries$pvalue_w_mark[summaries$varname == '", i, "'] <- ",
                 "weighted.mean(", i , "$pvalue_mark, ", i , "$fit_mark, na.rm = TRUE)")
  eval(parse(text = todo))
  
  todo <- paste0("summaries$se_w_mark[summaries$varname == '", i, "'] <- ",
                 "weighted.mean(", i , "$se_mark, ", i , "$fit_mark, na.rm = TRUE)")
  eval(parse(text = todo))
  
  todo <- paste0("summaries$beta_d_w_mark[summaries$varname == '", i, "'] <- ",
                 "weighted.mean(", i , "$beta_d_mark, ", i , "$fit_mark, na.rm = TRUE)")
  eval(parse(text = todo))
  
  todo <- paste0("summaries$pvalue_d_w_mark[summaries$varname == '", i, "'] <- ",
                 "weighted.mean(", i , "$pvalue_d_mark, ", i , "$fit_mark, na.rm = TRUE)")
  eval(parse(text = todo))
  
  
}


summaries$varconc <- varconc
save(summaries, file = "summaries.RData")
